# Data processing
#formatting
# trending_date is stored as yy.dd.mm in this dataset (e.g. "17.14.11").
data$trending_date<-as.Date(data$trending_date, "%y.%d.%m")
# publish_time is a full timestamp; as.Date() keeps only the date part.
data$publish_time<-as.Date(data$publish_time)
# Drop the thumbnail URL column - not used anywhere in the analysis.
data[,"thumbnail_link"]<-NULL
# Remove one known bad record.
# NOTE(review): assumes video_error_or_removed is the string "True", not a
# logical - confirm against how the CSV was read in.
data<-subset(data,!(data$video_id=="kZete48ZtsY"&data$video_error_or_removed=="True"))
#aggregate
# Collapse snapshots to one row per video; the count column is the number of
# trending-day snapshots each video has. Grouping keys are the per-video
# attributes.
# BUG FIX: channel_title appeared twice in the by= list and twice in the
# colnames, producing a data frame with a duplicated "channel_title" column;
# the duplicate is removed here.
data1<-aggregate(data[,c("video_id")],
by=list(data$video_id,data$title,data$channel_title,data$category_id,data$publish_time,data$tags,data$comments_disabled,data$ratings_disabled,data$video_error_or_removed,data$description),
FUN = length)
colnames(data1)<-c("video_id","title","channel_title","category_id","publish_time","tags","comments_disabled","ratings_disabled","video_error_or_removed","description","count")
# One row per video: how many trending-day snapshots each video_id has.
data10 <- aggregate(data$video_id, by = list(data$video_id), FUN = length)
colnames(data10) <- c("video_id", "trending_count")
# Text columns that later receive a cleaned "<name> P" companion column.
process_list <- list("description", "tags", "title")
# Flatten the snapshot-level data into one row per video:
#  - first/last trending date,
#  - the most recent title/tags/description/category/channel (temp is sorted
#    by trending_date descending, so temp[1, ] is the latest snapshot),
#  - how many distinct values each of those fields took across snapshots,
#  - latest plus min/max views/likes/dislikes/comment counts.
# (seq_len replaces 1:nrow; scalar print condition uses if, not ifelse.)
for(i in seq_len(nrow(data10))){
# progress report every 1000 videos
if(i%%1000==0) print(paste(i," ",round(i/nrow(data10)*100,2),"%"))
temp<-subset(data,data$video_id==data10[i,"video_id"])
temp<-temp[order(temp$trending_date,decreasing = TRUE),]
data10[i,"first_trending"]<-min(temp$trending_date)
data10[i,"last_trending"]<-max(temp$trending_date)
data10[i,"title"]<-temp[1,"title"]
data10[i,"title_change_count"]<-length(unique(temp$title))
data10[i,"tags"]<-temp[1,"tags"]
data10[i,"tag_change_count"]<-length(unique(temp$tags))
data10[i,"description"]<-temp[1,"description"]
data10[i,"des_change_count"]<-length(unique(temp$description))
data10[i,"category_id"]<-temp[1,"category_id"]
data10[i,"cat_change_count"]<-length(unique(temp$category_id))
data10[i,"channel_title"]<-temp[1,"channel_title"]
data10[i,"chn_change_count"]<-length(unique(temp$channel_title))
data10[i,"publish_time"]<-min(temp$publish_time)
data10[i,"views"]<-temp[1,"views"]
data10[i,"max_view"]<-max(temp$views)
data10[i,"min_view"]<-min(temp$views)
data10[i,"likes"]<-temp[1,"likes"]
data10[i,"max_likes"]<-max(temp$likes)
data10[i,"min_likes"]<-min(temp$likes)
data10[i,"dislikes"]<-temp[1,"dislikes"]
data10[i,"max_dislikes"]<-max(temp$dislikes)
data10[i,"min_dislikes"]<-min(temp$dislikes)
data10[i,"comment_count"]<-temp[1,"comment_count"]
data10[i,"max_comments"]<-max(temp$comment_count)
data10[i,"min_comments"]<-min(temp$comment_count)
data10[i,"comments_disabled"]<-temp[1,"comments_disabled"]
data10[i,"ratings_disabled"]<-temp[1,"ratings_disabled"]
data10[i,"video_error_or_removed"]<-temp[1,"video_error_or_removed"]
}
## [1] "1000 15.75 %"
## [1] "2000 31.49 %"
## [1] "3000 47.24 %"
## [1] "4000 62.98 %"
## [1] "5000 78.73 %"
## [1] "6000 94.47 %"
# Replace URLs in descriptions with a placeholder, count the placeholders
# per video (des_url_count), then strip them out again.
data10$description<-str_replace_all(data10$description,"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+","EXTERNALURL ")
# NOTE(review): this pattern mixes a literal backslash-r (regex \\r) with a
# real newline (\n) - presumably line breaks are stored as literal "\n"/"\r"
# text in the CSV; verify the escaping is what was intended.
data10$description<-str_replace_all(data10$description,"(\\\\r\\n|\\\\r|\\\\n)"," ")
# number of external links found in each description
data10$des_url_count<-unlist(lapply(data10$description,function(x){length(str_extract_all(x,"EXTERNALURL")[[1]])}))
data10$description<-str_replace_all(data10$description,"EXTERNALURL"," ")
# Create cleaned "<col> P" versions of description/tags/title for text mining.
# BUG FIX: the original recomputed every step from the raw column
# data10[[j]], so each assignment overwrote the previous one and only the
# final tolower() actually took effect. Each step now feeds the result of
# the previous step; trimming runs last so spaces introduced by the
# character filters are removed too.
for (j in process_list){
cleaned <- gsub("'", "", data10[[j]])          # remove apostrophes
cleaned <- gsub("[[:punct:]]", " ", cleaned)   # replace punctuation with space
cleaned <- gsub("[[:cntrl:]]", " ", cleaned)   # replace control characters with space
cleaned <- gsub("[^a-zA-Z -]", " ", cleaned)   # allow only letters, spaces, hyphens
cleaned <- gsub("^[[:space:]]+", "", cleaned)  # remove whitespace at beginning
cleaned <- gsub("[[:space:]]+$", "", cleaned)  # remove whitespace at end
data10[[paste(j,"P")]] <- tolower(cleaned)
}
# Derived per-video metrics.
# Days between first and last trending appearance (0 if a single day).
data10$trending_span = as.numeric(data10$last_trending-data10$first_trending)
# Days from publication to first trending appearance.
data10$day_until_trending = as.numeric(data10$first_trending-data10$publish_time)
# like_rate is NaN when likes+dislikes == 0 (e.g. ratings disabled); those
# rows surface later as "non-finite values" warnings in the plots.
data10$like_rate=round(data10$likes/(data10$likes+data10$dislikes)*100,2)
data10$like_engage_rate=round((data10$likes+data10$dislikes)/data10$views*100,2)
data10$comment_engage_rate=round(data10$comment_count/data10$views*100,2)
# Average daily growth while trending; 0 when the span is a single day.
data10$view_increment=ifelse(data10$trending_span>0,round((data10$max_view-data10$min_view)/data10$trending_span,2),0)
data10$like_increment=ifelse(data10$trending_span>0,round((data10$max_likes-data10$min_likes)/data10$trending_span,2),0)
data10$dislike_increment=ifelse(data10$trending_span>0,round((data10$max_dislikes-data10$min_dislikes)/data10$trending_span,2),0)
data10$comment_increment=ifelse(data10$trending_span>0,round((data10$max_comments-data10$min_comments)/data10$trending_span,2),0)
# Ad-hoc weighted engagement score; the weights and denominators are
# hand-tuned magic numbers chosen by the author, not derived from the data.
data10$engagement_score<-50*data10$comment_count/25000 +20*(data10$likes+data10$dislikes)/500000-5*data10$day_until_trending+10*data10$view_increment/250 +5*data10$views/50000000
# YouTube category_id -> human-readable name. Ids 3-9, 11-14 and 16 do not
# occur, hence the gaps in the id vector below.
cat_dict<-c("Film & Animation","Autos & Vehicles","Music","Pets & Animals","Sports","Short Movies","Travel & Events","Gaming","Videoblogging","People & Blogs","Comedy","Entertainment","News & Politics","Howto & Style","Education","Science & Technology","Nonprofits & Activism","Movies","Anime/Animation","Action/Adventure","Classics","Comedy","Documentary","Drama","Family","Foreign","Horror","Sci-Fi/Fantasy","Thriller","Shorts","Shows","Trailers")
names(cat_dict)<-c(1,2,10,15,17:44)
# Vectorized named-vector lookup replaces the per-element lapply with `[[`;
# unlike `[[`, `[` yields NA for an unknown id instead of raising an error.
data10$category_name<-unname(cat_dict[as.character(data10$category_id)])
colnames(data10)
## [1] "video_id" "trending_count" "first_trending"
## [4] "last_trending" "title" "title_change_count"
## [7] "tags" "tag_change_count" "description"
## [10] "des_change_count" "category_id" "cat_change_count"
## [13] "channel_title" "chn_change_count" "publish_time"
## [16] "views" "max_view" "min_view"
## [19] "likes" "max_likes" "min_likes"
## [22] "dislikes" "max_dislikes" "min_dislikes"
## [25] "comment_count" "max_comments" "min_comments"
## [28] "comments_disabled" "ratings_disabled" "video_error_or_removed"
## [31] "des_url_count" "description P" "tags P"
## [34] "title P" "trending_span" "day_until_trending"
## [37] "like_rate" "like_engage_rate" "comment_engage_rate"
## [40] "category_name"
# Month-level views of publish dates and first-trending dates.
temp <- data10$first_trending
temp<-data.frame(temp)
temp$pub <-data10$publish_time
temp$diff<-data10$day_until_trending
temp$PUByearmonth<-format(temp$pub,"%Y-%m")
# temp$temp is the first_trending column (data.frame() named it "temp")
temp$TRDyearmonth<-format(temp$temp,"%Y-%m")
# "YYYY-MM" strings compare correctly as text, so < and >= work below.
ggplot(data = temp[temp$PUByearmonth<"2017-10",],mapping = aes(x=PUByearmonth))+geom_bar()+theme(axis.text.x = element_text(angle = 90, hjust = 1))+
ggtitle("first publish date (before 2017-10)")

ggplot(data = temp[temp$PUByearmonth>="2017-10",],mapping = aes(x=PUByearmonth))+geom_bar()+ theme(axis.text.x = element_text(angle = 90, hjust = 1))+
ggtitle("first publish date (after 2017-10)(included)")+
stat_count(geom = "text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)

ggplot(data = temp,mapping = aes(x=TRDyearmonth))+geom_bar()+ theme(axis.text.x = element_text(angle = 90, hjust = 1))+
ggtitle("first trending date")+
stat_count(geom = "text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)

ggplot(data = data10,mapping = aes(x=trending_count))+geom_bar()+
ggtitle("how many times on trending")+
stat_count(geom = "text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)

ggplot(data = data10,mapping = aes(x=trending_span))+geom_bar()+
ggtitle("how many days between first and last trending")+
stat_count(geom = "text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)

# One quanteda corpus per cleaned text field; the document variables carry
# each video's publish date, first trending date and view count.
# NOTE(review): quanteda's argument is `docvars` (plural) - `docvar=` only
# works through partial argument matching; confirm against the quanteda
# version in use.
newscorpus_des<- corpus(data10$`description P`,
docnames=data10$video_id,
docvar=data.frame(pos=data10$publish_time,
date=data10$first_trending,
loc = data10$views))
newscorpus_tag<- corpus(data10$`tags P`,
docnames=data10$video_id,
docvar=data.frame(pos=data10$publish_time,
date=data10$first_trending,
loc = data10$views))
newscorpus_title<- corpus(data10$`title P`,
docnames=data10$video_id,
docvar=data.frame(pos=data10$publish_time,
date=data10$first_trending,
loc = data10$views))
# Start with the description corpus: document-feature matrix with English
# stopwords removed, no stemming. (remove=/stem= inside dfm() is the
# pre-3.0 quanteda API.)
newscorpus<-newscorpus_des
dfm.simple<- dfm(newscorpus,
remove = c(stopwords("english")),
verbose=F,
stem=F)
# Inspect top features to decide which junk tokens to add to the stoplist.
topfeatures(dfm.simple, n=50)
## , : . â \200 - ! \231
## 36786 33763 23002 16889 16306 15329 9842 5630
## ¢ ) ( / _ video " s
## 5161 4406 4227 4016 3551 2937 2932 2713
## subscribe ? us twitter º music facebook '
## 2689 2547 2448 2373 2315 2284 2264 2252
## videos new instagram | show & get »
## 2214 2186 2185 2174 2158 2153 2063 1878
## watch youtube channel news like * jimmy follow
## 1821 1800 1734 1661 1531 1513 1486 1456
## live ì late can now ë will t
## 1436 1405 1369 1303 1256 1168 1135 1133
## one website
## 1085 1013
# Manual stoplist of punctuation and mojibake tokens spotted above (tokens
# such as "â", "\200" are mis-decoded UTF-8 artifacts and must stay as-is).
swlist = c(",", ":",".", "â", "€","-","!","™","¢",")","(","/","_","\"","s","º","\'","|","&","»","ë","ì","t","can","˜","¸","+","ð","ðÿ","¶","¡","linebreak","„","âž","°")
# Rebuild the description dfm with the extended stoplist.
dfm.stem<- dfm(newscorpus,
remove = c(swlist,stopwords("english")),
verbose=F,
stem=F)
topfeatures(dfm.stem, n=50)
## video subscribe ? us twitter music facebook
## 2937 2689 2547 2448 2373 2284 2264
## videos new instagram show get watch youtube
## 2214 2186 2185 2158 2063 1821 1800
## channel news like * jimmy follow live
## 1734 1661 1531 1513 1486 1456 1436
## late now will one website make love
## 1369 1256 1135 1085 1013 1003 982
## first world time see night just production
## 976 941 898 884 880 856 852
## official use cbs check full know kimmel
## 850 849 849 840 837 808 797
## latest nbc best day go life every
## 791 761 753 748 743 740 735
## visit
## 732
set.seed(333) # keep the cloud layout reproducible
# Word cloud of the most frequent description tokens (top 100 shown).
freq<-topfeatures(dfm.stem, n=500)
wordcloud(names(freq),
freq, max.words=100,
scale=c(3, .3),
colors=brewer.pal(8, "Dark2"))

# Repeat the same pipeline for the tags corpus.
newscorpus<-newscorpus_tag
dfm.simple<- dfm(newscorpus,
remove = c(stopwords("english")),
verbose=F,
stem=F)
# Inspect top features to tune the tag-specific stoplist below.
topfeatures(dfm.simple, n=50)
## | ¤ à show video
## 120447 1738 1678 1424 1391
## funny news new makeup 2018
## 1348 1138 1031 978 953
## music comedy late live ì
## 891 886 760 751 683
## food trailer best movie ¥
## 674 619 616 590 580
## youtube . \200 night 2017
## 578 548 541 511 511
## ë game ellen interview tv
## 505 503 496 492 486
## james first vs star official
## 478 470 461 447 440
## black review videos life world
## 438 435 433 431 426
## = season challenge celebrity (
## 411 407 405 399 397
## talk ) beauty entertainment nbc
## 396 393 389 387 387
# Tag-specific stoplist (again including mis-encoded UTF-8 artifacts).
swlist = c("‚", ":",".", "à ", "€","-","!","¤","¢",")","(","/","_","\"","s","?","º","\'","|","&","»","ë","ì","t","can","˜","¸","+","¥","=","¶","¡","linebreak","„","âž","°")
dfm.stem<- dfm(newscorpus,
remove = c(swlist,stopwords("english")),
verbose=F,
stem=F)
topfeatures(dfm.stem, n=50)
## show video funny news new
## 1424 1391 1348 1138 1031
## makeup 2018 music comedy late
## 978 953 891 886 760
## live food trailer best movie
## 751 674 619 616 590
## youtube night 2017 game ellen
## 578 511 511 503 496
## interview tv james first vs
## 492 486 478 470 461
## star official black review videos
## 447 440 438 435 433
## life world season challenge celebrity
## 431 426 407 405 399
## talk beauty entertainment nbc iphone
## 396 389 387 387 385
## tutorial smith cat super diy
## 384 370 360 358 351
## make stephen nba science movies
## 346 345 341 340 338
set.seed(142) # keep the cloud layout reproducible
# Word cloud of the most frequent tag tokens (top 120 shown).
freq<-topfeatures(dfm.stem, n=500)
wordcloud(names(freq),
freq, max.words=120,
scale=c(3, .3),
colors=brewer.pal(8, "Dark2"))

# Repeat the same pipeline for the title corpus.
newscorpus<-newscorpus_title
dfm.simple<- dfm(newscorpus,
remove = c(stopwords("english")),
verbose=F,
stem=F)
# Inspect top features to tune the title-specific stoplist below.
topfeatures(dfm.simple, n=50)
## | - ) ( . ! : ,
## 1840 1591 944 941 879 840 726 698
## \200 official ' ? 2018 & video trailer
## 518 489 455 377 364 322 317 276
## \231 â [ ] / 2017 vs new
## 257 226 208 208 183 179 178 170
## s ft live first hd 2 " audio
## 159 155 144 140 139 135 131 122
## music day game show makeup star $ full
## 115 112 108 106 104 95 95 94
## 10 black espn 1 5 best will challenge
## 93 92 90 90 89 86 83 83
## christmas make
## 82 82
# Title-specific stoplist (keeps brackets/currency symbols seen in titles).
swlist = c(",", ":",".", "à ", "€","-","!","¤","¢",")","(","/","_","\"","ë","?","º","\'","|","&","[","]","ì","ðÿ","™","˜","¸","+","¥","=","¶","¡","â","„","©","$")
dfm.stem<- dfm(newscorpus,
remove = c(swlist,stopwords("english")),
verbose=F,
stem=F)
topfeatures(dfm.stem, n=50)
## official 2018 video trailer 2017 vs new
## 489 364 317 276 179 178 170
## s ft live first hd 2 audio
## 159 155 144 140 139 135 122
## music day game show makeup star full
## 115 112 108 106 104 95 94
## 10 black espn 1 5 best will
## 93 92 90 90 89 86 83
## challenge christmas make movie james news season
## 83 82 82 81 80 78 75
## time super wars world last highlights love
## 74 73 72 71 70 68 68
## 3 top netflix john get one x
## 67 66 66 64 64 64 63
## bowl
## 62
set.seed(142) # keep the cloud layout reproducible
# Word cloud of the most frequent title tokens (top 200 shown).
freq<-topfeatures(dfm.stem, n=500)
wordcloud(names(freq),
freq, max.words=200,
scale=c(3, .3),
colors=brewer.pal(8, "Dark2"))

# Category distribution across all trending videos.
ggplot(data = data10,aes(category_name))+geom_bar()+ theme(axis.text.x = element_text(angle = 90, hjust = 1))+ggtitle("category distribution")+
stat_count(geom = "text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)

# For every category, extract the four most frequent tag tokens across that
# category's videos.
swlist = c("‚", ":",".", "à ", "€","-","!","¤","¢",")","(","/","_","\"","s","º","\'","|","&","»","ë","ì","t","can","˜","¸","+","¥","=","¶","¡","linebreak","„","âž","°","ð","ñ")
data30<-data.frame(cat_dict)
for(i in seq_len(nrow(data30))){
temp<-subset(data10,data10$category_name==data30[i,1])
if(nrow(temp)==0){
next
}
newscorpus_temp<- corpus(temp$`tags P`,docnames=temp$video_id)
dfm.stem<- dfm(newscorpus_temp,
remove = c(swlist,stopwords("english")),
verbose=F,stem=F)
temp_list<-topfeatures(dfm.stem, n=5)
# BUG FIX: the original tested length(temp_list>k), which is the length of a
# logical vector (always the full length), instead of length(temp_list)>=k.
# Guard the length first (short-circuit &&) before indexing element k.
data30[i,"top_tag"]<-if(length(temp_list)>=1&&temp_list[1]>=1) names(temp_list)[1] else NA
data30[i,"second_tag"]<-if(length(temp_list)>=2&&temp_list[2]>=1) names(temp_list)[2] else NA
data30[i,"third_tag"]<-if(length(temp_list)>=3&&temp_list[3]>=1) names(temp_list)[3] else NA
data30[i,"fourth_tag"]<-if(length(temp_list)>=4&&temp_list[4]>=1) names(temp_list)[4] else NA
}
# Show only categories that actually have trending videos.
data30[!is.na(data30$top_tag),]
## cat_dict top_tag second_tag third_tag fourth_tag
## 1 Film & Animation trailer movie film clark
## 2 Autos & Vehicles car super bowl commercial
## 10 Music music video new official
## 15 Pets & Animals cat dog cats animals
## 17 Sports nba first espn highlights
## 19 Travel & Events food street travel best
## 20 Gaming game nintendo gameplay fnaf
## 22 People & Blogs buzzfeed safiya fashion video
## 23 Comedy funny comedy show video
## 24 Entertainment show late funny ellen
## 25 News & Politics news video trump today
## 26 Howto & Style makeup beauty food tutorial
## 27 Education life noggin science education
## 28 Science & Technology iphone science x tech
## 29 Nonprofits & Activism homeless logan paul suicide
## 34 Comedy funny comedy show video
## 43 Shows iphone apple plus home
# Plot-type rule of thumb used below:
#  - continuous variable: histogram
#  - categorical without a y value: bar
#  - categorical with a y value: col
ggplot(data = data10,aes(trending_count))+geom_bar()+
ggtitle("How many days a video is trending")+
stat_count(geom = "text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)

ggplot(data = data10,aes(title_change_count))+geom_bar()+
ggtitle("times a video changed titles")+
stat_count(geom = "text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)

ggplot(data = data10,aes(tag_change_count))+geom_bar()+
ggtitle("times a video changed tags")+
stat_count(geom = "text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)

ggplot(data = data10,aes(des_change_count))+geom_bar()+
ggtitle("times a video changed description")+
stat_count(geom = "text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)

ggplot(data = data10,aes(cat_change_count))+geom_bar()+
ggtitle("times a video changed category")+
stat_count(geom = "text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)

# BUG FIX: corrected "vhannel" typo in the user-facing plot title.
ggplot(data = data10,aes(chn_change_count))+geom_bar()+
ggtitle("times a video's owner changed channel title")+
stat_count(geom = "text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)

# View-count distributions; the data is split at 15M because a few outliers
# would otherwise dominate the x axis.
ggplot(data = data10,aes(views))+geom_histogram(bins=15)+
ggtitle("views distribution")+
stat_bin(bins=15, geom="text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)

ggplot(data = data10[data10$views<=15000000,],aes(views))+geom_histogram(bins=15)+
ggtitle("views distribution <=15000000")+
stat_bin(bins=15, geom="text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)

# xlim() drops values outside the limits, hence the "Removed rows" warnings.
ggplot(data = data10[data10$views>15000000,],aes(views))+geom_histogram(bins=15)+
ggtitle("views distribution >15000000")+xlim(15000000,max(data10$views))+
stat_bin(bins=15, geom="text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)
## Warning: Removed 2 rows containing missing values (geom_bar).
## Warning: Removed 1 rows containing missing values (geom_text).

# Likes/dislikes/comment-count distributions, each split at 100k to keep the
# bulk of the data readable; xlim() on the ">" plots drops edge values, which
# produces the "Removed rows" warnings echoed below.
ggplot(data = data10[data10$likes<=100000,],aes(likes))+geom_histogram(bins=15)+
ggtitle("likes distribution <=100000")+
stat_bin(bins=15, geom="text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)

ggplot(data = data10[data10$likes>100000,],aes(likes))+geom_histogram(bins=15)+
ggtitle("likes distribution >100000")+xlim(100000,max(data10$likes))+
stat_bin(bins=15, geom="text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)
## Warning: Removed 2 rows containing missing values (geom_bar).
## Warning: Removed 1 rows containing missing values (geom_text).

ggplot(data = data10[data10$dislikes<=100000,],aes(dislikes))+geom_histogram(bins=15)+
ggtitle("dislikes distribution <=100000")+
stat_bin(bins=15, geom="text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)

ggplot(data = data10[data10$dislikes>100000,],aes(dislikes))+geom_histogram(bins=15)+
ggtitle("dislikes distribution >100000")+xlim(100000,max(data10$dislikes))+
stat_bin(bins=15, geom="text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)
## Warning: Removed 2 rows containing missing values (geom_bar).
## Warning: Removed 1 rows containing missing values (geom_text).

ggplot(data = data10[data10$comment_count<=100000,],aes(comment_count))+geom_histogram(bins=15)+
ggtitle("comment_count distribution <=100000")+
stat_bin(bins=15, geom="text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)

ggplot(data = data10[data10$comment_count>100000,],aes(comment_count))+geom_histogram(bins=15)+
ggtitle("comment_count distribution >100000")+xlim(100000,max(data10$comment_count))+
stat_bin(bins=15, geom="text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)
## Warning: Removed 2 rows containing missing values (geom_bar).
## Warning: Removed 1 rows containing missing values (geom_text).

# Boolean flags: simple bar charts.
ggplot(data = data10,aes(comments_disabled))+geom_bar()+
ggtitle("is the video disabled comments")+
stat_count(geom = "text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)

ggplot(data = data10,aes(ratings_disabled))+geom_bar()+
ggtitle("is the video disabled ratings")+
stat_count(geom = "text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)

ggplot(data = data10,aes(video_error_or_removed))+geom_bar()+
ggtitle("is the video disabled removed")+
stat_count(geom = "text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)

# URL count, trending span and days-until-trending distributions.
ggplot(data = data10,aes(des_url_count))+geom_histogram()+
ggtitle("how many external links in the description")+
stat_bin( geom="text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = data10,aes(trending_span))+geom_histogram()+
ggtitle("how many days between first and last day trending")+
stat_bin( geom="text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# log2(0) = -Inf, so same-day trending rows are dropped as non-finite here.
ggplot(data = data10,aes(log2(day_until_trending)))+geom_histogram()+
ggtitle("how many days until trending log2")+
stat_bin( geom="text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 120 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 120 rows containing non-finite values (stat_bin).

ggplot(data = data10[data10$day_until_trending<=100,],aes(day_until_trending))+geom_histogram(bins=15)+
ggtitle("how many days until trending <=100")+
stat_bin(bins=15, geom="text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)

ggplot(data = data10[data10$day_until_trending>100,],aes(day_until_trending))+geom_histogram(bins=15)+
ggtitle("how many days until trending >100")+
stat_bin(bins=15, geom="text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)

# Rate distributions.
# NOTE(review): these three pair geom_histogram() (stat_bin) with
# stat_count() labels - the labels count distinct values, not histogram
# bins, which is why both stat messages appear in the output; presumably
# stat_bin labels were intended. Left as-is here.
ggplot(data = data10,aes(like_rate))+geom_histogram()+
ggtitle("video like rate distribution")+
stat_count(geom = "text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 31 rows containing non-finite values (stat_bin).
## Warning: Removed 31 rows containing non-finite values (stat_count).

ggplot(data = data10,aes(like_engage_rate))+geom_histogram()+
ggtitle("percentage of like interaction distribution")+
stat_count(geom = "text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = data10,aes(comment_engage_rate))+geom_histogram()+
ggtitle("percentage of comments interaction distribution")+
stat_count(geom = "text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Daily view growth while trending, split at 1M views/day.
ggplot(data = data10[data10$view_increment<=1000000,],aes(view_increment))+geom_histogram(bins=15)+
ggtitle("average view increment when trending(<=1000000)")+
stat_bin(bins=15, geom="text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)

ggplot(data = data10[data10$view_increment>1000000,],aes(view_increment))+geom_histogram(bins=15)+
ggtitle("average view increment when trending(>1000000)")+xlim(1000000,max(data10$view_increment))+
stat_bin(bins=15, geom="text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)
## Warning: Removed 2 rows containing missing values (geom_bar).
## Warning: Removed 1 rows containing missing values (geom_text).

# Daily like/dislike/comment growth while trending.
# BUG FIX: all six titles below said "average view increment" (copy-paste
# from the views section); corrected to name the variable actually plotted.
ggplot(data = data10[data10$like_increment<=3000,],aes(like_increment))+geom_histogram(bins=15)+
ggtitle("average like increment when trending(<=3000)")+
stat_bin(bins=15, geom="text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)

ggplot(data = data10[data10$like_increment>3000,],aes(like_increment))+geom_histogram(bins=15)+
ggtitle("average like increment when trending(>3000)")+xlim(3000,max(data10$like_increment))+
stat_bin(bins=15, geom="text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)
## Warning: Removed 2 rows containing missing values (geom_bar).
## Warning: Removed 1 rows containing missing values (geom_text).

ggplot(data = data10[data10$dislike_increment<=1000,],aes(dislike_increment))+geom_histogram(bins=15)+
ggtitle("average dislike increment when trending(<=1000)")+
stat_bin(bins=15, geom="text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)

ggplot(data = data10[data10$dislike_increment>1000,],aes(dislike_increment))+geom_histogram(bins=15)+
ggtitle("average dislike increment when trending(>1000)")+xlim(1000,max(data10$dislike_increment))+
stat_bin(bins=15, geom="text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)
## Warning: Removed 2 rows containing missing values (geom_bar).
## Warning: Removed 1 rows containing missing values (geom_text).

ggplot(data = data10[data10$comment_increment<=1000,],aes(comment_increment))+geom_histogram(bins=15)+
ggtitle("average comment increment when trending(<=1000)")+
stat_bin(bins=15, geom="text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)

ggplot(data = data10[data10$comment_increment>1000,],aes(comment_increment))+geom_histogram(bins=15)+
ggtitle("average comment increment when trending(>1000)")+xlim(1000,max(data10$comment_increment))+
stat_bin(bins=15, geom="text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)
## Warning: Removed 2 rows containing missing values (geom_bar).
## Warning: Removed 1 rows containing missing values (geom_text).

# Distribution of the hand-tuned engagement score computed earlier.
ggplot(data = data10,aes(engagement_score))+geom_histogram(bins=15)+
ggtitle("engagement score distribution")+
stat_bin(bins=15, geom="text", colour = "black", size = 3.5,aes(label = ..count..),vjust=-0.3)

# Channel-level aggregation plan: which per-video columns get mean / max /
# sum, and which get top-feature extraction.
#youtuberAG_list<-c("channel_title","min_view","min_likes","min_dislikes","min_comments")
temp_avg<-c("trending_count","title_change_count","tag_change_count","des_change_count","cat_change_count","views","likes","dislikes","comment_count","des_url_count","trending_span","day_until_trending","like_rate","like_engage_rate","comment_engage_rate","engagement_score","view_increment","like_increment","dislike_increment","comment_increment")
temp_max<-c("max_view","max_likes","max_dislikes","max_comments","trending_span","engagement_score","view_increment","like_increment","dislike_increment","comment_increment")
temp_sum<-c("views","comment_count","likes")
temp_top<-c("tags","category_name")
# data20: one row per channel, seeded with the trending-video count.
data20<- aggregate(data10$video_id,by=list(data10$channel_title),FUN=length)
colnames(data20)<-c("channel","trend_video_count")
# Stoplist for the per-channel tag extraction (mojibake tokens kept as-is).
swlist = c("‚", ":",".", "à ", "€","-","!","¤","¢",")","(","/","_","\"","s","º","\'","|","&","»","ë","ì","t","ìš","˜","¸","+","¥","=","¶","¡","¬","„","ãƒ","°","ã",
"[","ð","´")
# Aggregate per channel: means, maxima and sums of the per-video metrics,
# plus the channel's two most frequent tag tokens and categories.
for(i in seq_len(nrow(data20))){
# progress report every 100 channels (scalar condition: if, not ifelse)
if(i%%100==0) print(paste(i," ",round(i/nrow(data20)*100,2),"%"))
temp<-subset(data10,data10$channel_title==data20[i,"channel"])
for(j in temp_avg){
data20[i,paste0("avg_",j)]<-mean(temp[[j]])
}
for(j in temp_max){
data20[i,paste0("max_",j)]<-max(temp[[j]])
}
for(j in temp_sum){
data20[i,paste0("sum_",j)]<-sum(temp[[j]])
}
newscorpus_temp<- corpus(temp$`tags P`,docnames=temp$video_id)
dfm.stem<- dfm(newscorpus_temp,
remove = c(swlist,stopwords("english")),
verbose=F,stem=F)
temp_list<-topfeatures(dfm.stem, n=2)
# BUG FIX: length(temp_list>1) is the length of a logical vector (always the
# full length); the intended guard is length(temp_list)>=2 before indexing.
data20[i,"top_tag"]<-if(length(temp_list)>=1&&temp_list[1]>=1) names(temp_list)[1] else NA
data20[i,"second_tag"]<-if(length(temp_list)>=2&&temp_list[2]>=1) names(temp_list)[2] else NA
# BUG FIX: table() orders categories alphabetically, so element 1 was merely
# the alphabetically-first category; sort by frequency to get the true top
# and runner-up categories.
temp_list<-sort(table(temp$category_name),decreasing = TRUE)
data20[i,"top_cat"]<-names(temp_list)[1]
data20[i,"second_cat"]<-if(length(temp_list)>=2) names(temp_list)[2] else NA
}
## [1] "100 4.55 %"
## [1] "200 9.1 %"
## [1] "300 13.64 %"
## [1] "400 18.19 %"
## [1] "500 22.74 %"
## [1] "600 27.29 %"
## [1] "700 31.83 %"
## [1] "800 36.38 %"
## [1] "900 40.93 %"
## [1] "1000 45.48 %"
## [1] "1100 50.02 %"
## [1] "1200 54.57 %"
## [1] "1300 59.12 %"
## [1] "1400 63.67 %"
## [1] "1500 68.21 %"
## [1] "1600 72.76 %"
## [1] "1700 77.31 %"
## [1] "1800 81.86 %"
## [1] "1900 86.4 %"
## [1] "2000 90.95 %"
## [1] "2100 95.5 %"
#only 101 have trending in more than 1 cat
# Inspect the full channel-level feature set assembled above
colnames(data20)
## [1] "channel" "trend_video_count"
## [3] "avg_trending_count" "avg_title_change_count"
## [5] "avg_tag_change_count" "avg_des_change_count"
## [7] "avg_cat_change_count" "avg_views"
## [9] "avg_likes" "avg_dislikes"
## [11] "avg_comment_count" "avg_des_url_count"
## [13] "avg_trending_span" "avg_day_until_trending"
## [15] "avg_like_rate" "avg_like_engage_rate"
## [17] "avg_comment_engage_rate" "max_max_view"
## [19] "max_max_likes" "max_max_dislikes"
## [21] "max_max_comments" "max_trending_span"
## [23] "sum_views" "sum_comment_count"
## [25] "sum_likes" "top_tag"
## [27] "second_tag" "top_cat"
## [29] "second_cat"
# The same "top 25 channels" horizontal bar chart repeated for several
# metrics; a spec list + loop replaces the copy-pasted chunks.  Axis titles
# are blanked by the original too, so plotting from a renamed `val` column is
# visually identical.
for (spec in list(
  list(col = "trend_video_count", title = "how many times its video is trending", digits = 0),
  list(col = "avg_trending_count", title = "how many days its trending video will stay trending(average)", digits = 0),
  list(col = "avg_title_change_count", title = "How many times the it changed it's trending video title", digits = 0),
  list(col = "avg_tag_change_count", title = "How many times the it changed it's trending video tag", digits = 2),
  list(col = "avg_des_change_count", title = "How many times the it changed it's trending video description", digits = 2)
)) {
  ranked <- data20[order(-data20[[spec$col]]), ]  # descending by the metric
  top25 <- ranked[1:25, ]
  dat <- data.frame(channel = top25$channel, val = top25[[spec$col]])
  cutoff <- max(ranked[[spec$col]]) / 1.5         # where labels flip inward
  lab_col <- ifelse(dat$val < max(dat$val) / 1.5, "black", "white")
  print(
    ggplot(dat, aes(x = reorder(channel, val), y = val)) +
      geom_col() +
      coord_flip() +
      xlab(element_blank()) +
      ylab(element_blank()) +
      ggtitle(spec$title) +
      geom_text(aes(label = round(val, spec$digits),
                    hjust = ifelse(val < cutoff, -0.1, 1.1)),
                size = 3, color = lab_col)
  )
}

# How often each channel changed a trending video's category (average).
# FIX: the x aesthetic reordered channels by avg_trending_count while y
# plotted avg_cat_change_count, so bars were sorted by the wrong metric.
temp<-data20[order(-data20$avg_cat_change_count,decreasing = F),]
ggplot(data = temp[1:25,],mapping = aes(x=reorder(channel,avg_cat_change_count),y=avg_cat_change_count))+geom_col()+coord_flip()+xlab(element_blank())+ylab(element_blank())+
ggtitle("How many times the it changed it's trending video category")+
geom_text(aes(label=round(avg_cat_change_count,2),hjust=ifelse(avg_cat_change_count<max(temp$avg_cat_change_count)/1.5, -0.1, 1.1)),size=3,color=unlist(lapply(temp[1:25,]$avg_cat_change_count,function(x){return(ifelse(x<max(temp[1:25,]$avg_cat_change_count)/1.5, "black","white"))})))

# Top 25 channels by average views per trending video
ranked <- data20[order(-data20$avg_views), ]
top25 <- ranked[1:25, ]
view_cols <- ifelse(top25$avg_views < max(top25$avg_views) / 1.5, "black", "white")
ggplot(top25, aes(x = reorder(channel, avg_views), y = avg_views)) +
  geom_col() +
  coord_flip() +
  xlab(element_blank()) +
  ylab(element_blank()) +
  ggtitle("How many view will its trending video get") +
  geom_text(aes(label = round(avg_views, 0),
                hjust = ifelse(avg_views < max(ranked$avg_views) / 1.5, -0.1, 1.1)),
            size = 3, color = view_cols)

# Top 25 channels by average likes per trending video.
# FIX: this chunk sorted by avg_likes but then plotted avg_views everywhere
# (copy-paste slip), so the chart contradicted its title; plot avg_likes.
temp<-data20[order(-data20$avg_likes,decreasing = F),]
ggplot(data = temp[1:25,],mapping = aes(x=reorder(channel,avg_likes),y=avg_likes))+geom_col()+coord_flip()+xlab(element_blank())+ylab(element_blank())+
ggtitle("How many likes will its trending video get")+
geom_text(aes(label=round(avg_likes,0),hjust=ifelse(avg_likes<max(temp$avg_likes)/1.5, -0.1, 1.1)),size=3,color=unlist(lapply(temp[1:25,]$avg_likes,function(x){return(ifelse(x<max(temp[1:25,]$avg_likes)/1.5, "black","white"))})))

# Top 25 channels by average dislikes per trending video.
# FIX: this chunk sorted by avg_dislikes but then plotted avg_views everywhere
# (copy-paste slip), so the chart contradicted its title; plot avg_dislikes.
temp<-data20[order(-data20$avg_dislikes,decreasing = F),]
ggplot(data = temp[1:25,],mapping = aes(x=reorder(channel,avg_dislikes),y=avg_dislikes))+geom_col()+coord_flip()+xlab(element_blank())+ylab(element_blank())+
ggtitle("How many dislikes will its trending video get")+
geom_text(aes(label=round(avg_dislikes,0),hjust=ifelse(avg_dislikes<max(temp$avg_dislikes)/1.5, -0.1, 1.1)),size=3,color=unlist(lapply(temp[1:25,]$avg_dislikes,function(x){return(ifelse(x<max(temp[1:25,]$avg_dislikes)/1.5, "black","white"))})))

# Seven more charts of the same "top 25 channels" recipe, driven by a spec
# list instead of copy-pasted chunks.  Behavior (ordering, labels, colors,
# titles) matches the originals exactly.
for (spec in list(
  list(col = "avg_comment_count", title = "How many comments will its trending video get", digits = 0),
  list(col = "avg_des_url_count", title = "How many URL will its trending video have in the description", digits = 0),
  list(col = "avg_trending_span", title = "How many days will its trending video can last", digits = 0),
  list(col = "avg_day_until_trending", title = "How many days will its trending video need to become trending(average)", digits = 0),
  list(col = "avg_like_rate", title = "How many viewers likes its trending video (average)", digits = 0),
  list(col = "avg_like_engage_rate", title = "How many viewer will click its trending video's like / dislike", digits = 2),
  list(col = "avg_comment_engage_rate", title = "How many viewer will leave comments on its trending video", digits = 2)
)) {
  ranked <- data20[order(-data20[[spec$col]]), ]  # descending by the metric
  top25 <- ranked[1:25, ]
  dat <- data.frame(channel = top25$channel, val = top25[[spec$col]])
  cutoff <- max(ranked[[spec$col]]) / 1.5         # where labels flip inward
  lab_col <- ifelse(dat$val < max(dat$val) / 1.5, "black", "white")
  print(
    ggplot(dat, aes(x = reorder(channel, val), y = val)) +
      geom_col() +
      coord_flip() +
      xlab(element_blank()) +
      ylab(element_blank()) +
      ggtitle(spec$title) +
      geom_text(aes(label = round(val, spec$digits),
                    hjust = ifelse(val < cutoff, -0.1, 1.1)),
                size = 3, color = lab_col)
  )
}

# Top 25 channels by the highest view count any single trending video reached.
# FIX: the title was the placeholder "channnel"; give it a descriptive title
# in the same wording as the neighbouring "max_max_*" charts.
temp<-data20[order(-data20$max_max_view,decreasing = F),]
ggplot(data = temp[1:25,],mapping = aes(x=reorder(channel,max_max_view),y=max_max_view))+geom_col()+coord_flip()+xlab(element_blank())+ylab(element_blank())+
ggtitle("how many views this youtuber ever have on single trending video")+
geom_text(aes(label=round(max_max_view,0),hjust=ifelse(max_max_view<max(temp$max_max_view)/1.5, -0.1, 1.1)),size=3,color=unlist(lapply(temp[1:25,]$max_max_view,function(x){return(ifelse(x<max(temp[1:25,]$max_max_view)/1.5, "black","white"))})))

# Per-channel maxima and totals, same "top 25 channels" bar chart recipe,
# driven by a spec list instead of copy-pasted chunks.
for (spec in list(
  list(col = "max_max_likes", title = "how many likes this youtuber ever have on single trending video"),
  list(col = "max_max_dislikes", title = "how many dislikes this youtuber ever have on single trending video"),
  list(col = "max_max_comments", title = "how many comments this youtuber ever have on single trending video"),
  list(col = "max_trending_span", title = "how many days its long lasting video stay trending"),
  list(col = "sum_views", title = "how many view this youtuber have for all its trending video"),
  list(col = "sum_comment_count", title = "how many comments this youtuber have for all its trending video"),
  list(col = "sum_likes", title = "how many likes this youtuber have for all its trending video")
)) {
  ranked <- data20[order(-data20[[spec$col]]), ]  # descending by the metric
  top25 <- ranked[1:25, ]
  dat <- data.frame(channel = top25$channel, val = top25[[spec$col]])
  cutoff <- max(ranked[[spec$col]]) / 1.5         # where labels flip inward
  lab_col <- ifelse(dat$val < max(dat$val) / 1.5, "black", "white")
  print(
    ggplot(dat, aes(x = reorder(channel, val), y = val)) +
      geom_col() +
      coord_flip() +
      xlab(element_blank()) +
      ylab(element_blank()) +
      ggtitle(spec$title) +
      geom_text(aes(label = round(val, 0),
                    hjust = ifelse(val < cutoff, -0.1, 1.1)),
                size = 3, color = lab_col)
  )
}

# How often each tag is some channel's most common tag (top 30 tags).
temp<-aggregate(data20$top_tag,by=list(data20$top_tag),FUN=length)
temp<-temp[order(-temp$x,decreasing = F),]
# FIX: geom_text() has no `geom` argument — it triggered the
# "Ignoring unknown parameters: geom" warning; pass only real parameters.
ggplot(data = temp[1:30,],mapping = aes(x=reorder(Group.1,-x),y=x))+geom_col()+ theme(axis.text.x = element_text(angle = 45, hjust = 1))+
ggtitle("most common tags among youtubers")+
geom_text(colour = "black", size = 3.5,aes(label = x),vjust=-0.3)

# How often each category is some channel's most common category (top 16).
temp<-aggregate(data20$top_cat,by=list(data20$top_cat),FUN=length)
temp<-temp[order(-temp$x,decreasing = F),]
# FIX: geom_text() has no `geom` argument — it triggered the
# "Ignoring unknown parameters: geom" warning; pass only real parameters.
ggplot(data = temp[1:16,],mapping = aes(x=reorder(Group.1,-x),y=x))+geom_col()+ theme(axis.text.x = element_text(angle = 45, hjust = 1))+
ggtitle("most common category among youtubers")+
geom_text(colour = "black", size = 3.5,aes(label = x),vjust=-0.3)

# Engagement-score and per-day increment charts: ten repeats of the same
# "top 25 channels" bar chart, driven by a spec list instead of copy-pasted
# chunks.  Titles are reproduced byte-for-byte (including "singel").
for (spec in list(
  list(col = "avg_engagement_score", title = "top youtubers with highest average engagement score"),
  list(col = "max_engagement_score", title = "top youtubers with highest singel video engagement score"),
  list(col = "avg_view_increment", title = "top youtubers with highest average view increment when trending"),
  list(col = "max_view_increment", title = "top youtubers with highest singel video view increment"),
  list(col = "avg_like_increment", title = "top youtubers with highest average like increment when trending"),
  list(col = "max_like_increment", title = "top youtubers with highest singel video like increment"),
  list(col = "avg_dislike_increment", title = "top youtubers with highest average dislike increment when trending"),
  list(col = "max_dislike_increment", title = "top youtubers with highest singel video dislike increment"),
  list(col = "avg_comment_increment", title = "top youtubers with highest average comment increment when trending"),
  list(col = "max_comment_increment", title = "top youtubers with highest singel video comment increment")
)) {
  ranked <- data20[order(-data20[[spec$col]]), ]  # descending by the metric
  top25 <- ranked[1:25, ]
  dat <- data.frame(channel = top25$channel, val = top25[[spec$col]])
  cutoff <- max(ranked[[spec$col]]) / 1.5         # where labels flip inward
  lab_col <- ifelse(dat$val < max(dat$val) / 1.5, "black", "white")
  print(
    ggplot(dat, aes(x = reorder(channel, val), y = val)) +
      geom_col() +
      coord_flip() +
      xlab(element_blank()) +
      ylab(element_blank()) +
      ggtitle(spec$title) +
      geom_text(aes(label = round(val, 0),
                    hjust = ifelse(val < cutoff, -0.1, 1.1)),
                size = 3, color = lab_col)
  )
}

plotcloud<-function(temp){
  # Render three word clouds for a subset of data10: one each for the
  # pre-processed title, description and tag text columns.
  #
  # temp: data frame with columns `video_id`, `title P`, `description P`,
  #       `tags P` (the " P" columns hold cleaned text — assumed, verify).
  # Returns nothing; called for its plotting side effect.
  #
  # Fixes vs the original: the first wordcloud() call had a stray trailing
  # comma (passing an empty extra argument), and the unassigned
  # topfeatures(dfm.stem, n = 50) calls were dead code inside a function
  # (their values were silently discarded).  The triplicated corpus/dfm/cloud
  # pipeline is factored into one inner helper.

  # Build a dfm for one text column and draw its word cloud.  The stop-word
  # lists are mostly mojibake fragments left by encoding problems.
  draw_cloud <- function(txt, sw, seed, max_words) {
    crp <- corpus(txt, docnames = temp$video_id)
    d <- dfm(crp,
             remove = c(sw, stopwords("english")),
             verbose = F,
             stem = F)
    set.seed(seed)  # keeps the cloud's shape fixed
    freq <- topfeatures(d, n = 500)
    wordcloud(names(freq),
              freq, max.words = max_words,
              scale = c(3, .3),
              colors = brewer.pal(8, "Dark2"))
  }

  # titles (seed / max.words exactly as in the original chunks)
  draw_cloud(temp$`title P`,
             c(",", ":",".", "à ", "€","-","!","¤","¢",")","(","/","_","\"","ë","?","º","\'","|","&","[","]","ì","ðÿ","™","˜","¸","+","¥","=","¶","¡","â","„","©","$"),
             seed = 142, max_words = 200)
  # descriptions
  draw_cloud(temp$`description P`,
             c(",", ":",".", "â", "€","-","!","™","¢",")","(","/","_","\"","s","º","\'","|","&","»","ë","ì","t","can","˜","¸","+","ð","ðÿ","¶","¡","linebreak","„","âž","°"),
             seed = 333, max_words = 100)
  # tags
  draw_cloud(temp$`tags P`,
             c("‚", ":",".", "à ", "€","-","!","¤","¢",")","(","/","_","\"","s","º","\'","|","&","»","ë","ì","t","can","˜","¸","+","¥","=","¶","¡","linebreak","„","âž","°"),
             seed = 142, max_words = 120)
}
# Hand-picked top-20 channels (presumably taken from the "average" charts
# above — verify against the plots); draw word clouds of their video text
avg_top20<-c("LuisFonsiVEVO","ChildishGambinoVEVO","Bruno Mars","David Dobrik","Logan Paul Vlogs","jbalvinVEVO","Dua Lipa","Maroon5VEVO","Ed Sheeran","TaylorSwiftVEVO","Cardi B","shakiraVEVO","joyner lucas","Collins Key","Today I Found Out","communitychannel","The Royal Family","Nikita Dragun","Desi Perkins","Tanner Braungardt")
plotcloud(data10[data10$channel_title%in%avg_top20,])



# Hand-picked top-20 channels (presumably taken from the single-video "max"
# charts above — verify against the plots); draw word clouds of their video text
max_top20<-c("ArianaGrandeVevo","SelenaGomezVEVO","ShawnMendesVEVO","NickyJamTV","GEazyMusicVEVO","Dude Perfect","Gorillaz","ImagineDragonsVEVO","Safiya Nygaard","Dani Ochoa","AsapSCIENCE","CamilaCabelloVEVO","Diplo","Clean Bandit",
"Lil Peep","LadyGagaVEVO","CaseyNeistat","MeghanTrainorVEVO","Lucas the Spider","Ryan Is Driving")
plotcloud(data10[data10$channel_title%in%max_top20,])


